Loading the libraries we gonna use

library(dplyr)
library(tidyjson)
library(rjson)
library(lubridate)
library(GGally)
library(ggplot2)
library(xts)
library(zoo)
library(data.table)
library(heatmaply)
library(reshape2)
library(factoextra)
library(pca3d)
library(forecast)
library(caret)
library(corrplot)

Loading the datasets, one for each month

jan <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-01-01 00:00/2020-01-31 23:59")
feb <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-02-01 00:00/2020-02-31 23:59")
mar <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-03-01 00:00/2020-03-31 23:59")
apr <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-04-01 00:00/2020-04-31 23:59")
may <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-05-01 00:00/2020-05-31 23:59")
jun <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-06-01 00:00/2020-06-31 23:59")

Extracting the values

We gonna extract the values from our dataset. We gonna create tables for every station for all six months.

We gonna use the tidyjson library and spread_all function to spread the values and create a matrix in which

every row is a hourly measures and details for all types of pollution. Then we gonna save the matrix as .csv file.

Station 1

# creating the list of values for station 1 for all six months
station1 <- c(jan$data$station_1$values,feb$data$station_1$values,mar$data$station_1$values,
          apr$data$station_1$values,may$data$station_1$values,jun$data$station_1$values)
station1 <- station1 %>% spread_all() # spreading the list to one data.frame
station1$..JSON <- NULL # deleting the last column
write.csv(station1,"station1.csv",row.names = F) # saving the data.frame

Station 2

station2 <- c(jan$data$station_2$values,feb$data$station_2$values,mar$data$station_2$values,
          apr$data$station_2$values,may$data$station_2$values,jun$data$station_2$values)
station2 <- station2 %>% spread_all()
station2$..JSON <- NULL
write.csv(station2,"station2.csv", row.names = F)

Station 3

station3 <- c(jan$data$station_3$values,feb$data$station_3$values,mar$data$station_3$values,
              apr$data$station_3$values,may$data$station_3$values,jun$data$station_3$values)
station3 <- station3 %>% spread_all()
station3$..JSON <- NULL
write.csv(station3,"station3.csv", row.names = F)

Station 5

station5 <- c(jan$data$station_5$values,feb$data$station_5$values,mar$data$station_5$values,
              apr$data$station_5$values,may$data$station_5$values,jun$data$station_5$values)
station5 <- station5 %>% spread_all()
station5$..JSON <- NULL
write.csv(station5,"station5.csv", row.names = F)

Station 7

station7 <- c(jan$data$station_7$values,feb$data$station_7$values,mar$data$station_7$values,
              apr$data$station_7$values,may$data$station_7$values,jun$data$station_7$values)
station7 <- station7 %>% spread_all()
station7$..JSON <- NULL
write.csv(station7,"station7.csv", row.names = F)

Station 8

station8 <- c(jan$data$station_8$values,feb$data$station_8$values,mar$data$station_8$values,
              apr$data$station_8$values,may$data$station_8$values,jun$data$station_8$values)
station8 <- station5 %>% spread_all()
station8$..JSON <- NULL
write.csv(station8,"station8.csv", row.names = F)

Station 9

station9 <- c(jan$data$station_9$values,feb$data$station_9$values,mar$data$station_9$values,
              apr$data$station_9$values,may$data$station_9$values,jun$data$station_9$values)
station9 <- station5 %>% spread_all()
station9$..JSON <- NULL
write.csv(station9,"station9.csv", row.names = F)

Station 10

station10 <- c(jan$data$station_10$values,feb$data$station_10$values,mar$data$station_10$values,
              apr$data$station_10$values,may$data$station_10$values,jun$data$station_10$values)
station10 <- station5 %>% spread_all()
station10$..JSON <- NULL
write.csv(station10,"station10.csv", row.names = F)

Station 11

station11 <- c(jan$data$station_11$values,feb$data$station_11$values,mar$data$station_11$values,
              apr$data$station_11$values,may$data$station_11$values,jun$data$station_11$values)
station11 <- station5 %>% spread_all()
station11$..JSON <- NULL
write.csv(station11,"station11.csv", row.names = F)

creating a vector with stations names

stations <- c("Nicosia_Traf","Nicosia_Resi","Limassol_Traf","Larnaca_Traf","Paphos_Traf",
              "Zygi_Indu","Mari_Indu","Ayia_Marina_Xylianou_Back","Paralimni_Traf")

Data preprocessing

We gonna load the datasets for every station, filter the columns and keep the values for every pollutant type. We’ll change the class of time to POSIXct, check for NAs and fill them using na.approx function

Station 1

s1 <- read.csv("station1.csv") # Loading the .csv file we previous create
s1 <- s1 %>% select(c(seq(2,38,4))) # Filtering the columns with pollutant values
names(s1) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25") #adding the column names
s1$time <- ymd_hms(s1$time) # changing the type of time
sapply(s1, function(x) sum(is.na(x))) # checking for NA
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0   99  325   35  104  104  371   96  104  325
# Filling NA with na.approx function
s1[,2:10] <- sapply(s1[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = T), x))
sapply(s1, function(x) sum(is.na(x))) # zero NA
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0    0    0    0    0    0    0    0    0    0
knitr::kable(head(s1,3)) # printing the first 3 rows
time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
2020-01-01 00:00:00 4.213296 113.18510 5.422678 41.66263 64.42079 1045.5090 5.304624 14.83504 89.80502
2020-01-01 01:00:00 3.986767 49.54911 7.720096 41.01508 90.58797 977.6353 2.203252 32.31437 35.75103
2020-01-01 02:00:00 4.940188 64.78768 4.493460 50.22812 119.87520 1263.8940 3.149906 45.39985 35.52024

Station 2

s2 <- read.csv("station2.csv")
s2 <- s2 %>% select(c(seq(2,26,4)))
names(s2) <- c("time","SO2","O3","NO2","NOx","CO","NO")
s2$time <- ymd_hms(s2$time)
sapply(s2, function(x) sum(is.na(x)))
## time  SO2   O3  NO2  NOx   CO   NO 
##    0  176   44  100   84   89   50
s2[,2:7] <- sapply(s2[,2:7], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 3

s3 <- read.csv("station3.csv")
s3 <- s3 %>% select(c(seq(2,30,4)))
names(s3) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO")
s3$time <- ymd_hms(s3$time)
sapply(s3, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO   NO 
##    0  723  128  124  227  227  709  253
s3[,2:8] <- sapply(s3[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 5

s5 <- read.csv("station5.csv")
s5 <- s5 %>% select(c(seq(2,38,4)))
names(s5) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s5$time <- ymd_hms(s5$time)
sapply(s5, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  199  182   97   64   64   98  183   95  195
s5[,2:10] <- sapply(s5[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 7

s7 <- read.csv("station7.csv")
s7 <- s7 %>% select(c(seq(2,38,4)))
names(s7) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s7$time <- ymd_hms(s7$time)
sapply(s7, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  665  405  121  165  164  332  530  206  479
s7[,2:10] <- sapply(s7[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 8

s8 <- read.csv("station8.csv")
s8 <- s8 %>% select(c(seq(2,34,4)))
names(s8) <- c("time","SO2","PM10","O3","NO2","NOx","C6H6","NO","PM25")
s8$time <- ymd_hms(s8$time)
sapply(s8, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx C6H6   NO PM25 
##    0  202  138   76   83   83  759   89  141
s8[,2:9] <- sapply(s8[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 9

s9 <- read.csv("station9.csv")
s9 <- s9 %>% select(c(seq(2,30,4)))
names(s9) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO")
s9$time <- ymd_hms(s9$time)
sapply(s9, function(x) sum(is.na(x)))
## time  SO2   O3  NO2  NOx   CO C6H6   NO 
##    0  180   45  108  108  626  938  323
s9[,2:8] <- sapply(s9[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 10

s10 <- read.csv("station10.csv")
s10 <- s10 %>% select(c(seq(2,38,4)))
names(s10) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","PM25","NO")
s10$time <- ymd_hms(s10$time)
sapply(s10, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO C6H6 PM25   NO 
##    0  305 1057  107  177  177  224  311 1095  426
s10[,2:10] <- sapply(s10[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 11

s11 <- read.csv("station11.csv")
s11 <- s11 %>% select(c(seq(2,34,4)))
names(s11) <- c("time","SO2","O3","NO2","NOx","CO","NO","PM10","PM25")
s11$time <- ymd_hms(s11$time)
sapply(s11, function(x) sum(is.na(x)))
## time  SO2   O3  NO2  NOx   CO   NO PM10 PM25 
##    0  275  113  121  121   83  271  864  864
s11[,2:9] <- sapply(s11[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Changing the time

Next step is gonna be to change the time. Our dataset is in hours so we gonna create two new datasets for every station in which we gonna store mean value for each day and for each month. First we’ll transform the datasets to xts type and then create .d and .m datasets for days and months.

Station 1

s1.xts <- xts(s1, order.by = s1$time) # creating the xts type of s1 dataset
s1.d <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "days"), mean) #finding the mean for every day
s1.m <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "months"), mean) #finding the mean for every month
s1.d <- as.data.frame(s1.d) # changing for xts to data.frame type
s1.m <- as.data.frame(s1.m) # changing for xts to data.frame type

Station 2

s2.xts <- xts(s2, order.by = s2$time)
s2.d <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "days"), mean)
s2.m <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "months"), mean)
s2.d <- as.data.frame(s2.d)
s2.m <- as.array.default(s2.m)
# Creating a time column to join the datasets later. Only for the stations with missing days
s2.d$time <- as.POSIXct(row.names(s2.d),format = "%Y-%m-%d")

Station 3

s3.xts <- xts(s3, order.by = s3$time)
s3.d <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "days"), mean)
s3.m <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "months"), mean)
s3.d <- as.data.frame(s3.d)
s3.m <- as.data.frame(s3.m)

Station 5

s5.xts <- xts(s5, order.by = s5$time)
s5.d <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "days"), mean)
s5.m <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "months"), mean)
s5.d <- as.data.frame(s5.d)
s5.m <- as.data.frame(s5.m)

Station 7

s7.xts <- xts(s7, order.by = s7$time)
s7.d <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "days"), mean)
s7.m <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "months"), mean)
s7.d <- as.data.frame(s7.d)
s7.m <- as.data.frame(s7.m)
s7.d$time <- as.POSIXct(row.names(s7.d),format = "%Y-%m-%d")

Station 8

s8.xts <- xts(s8, order.by = s8$time)
s8.d <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "days"), mean)
s8.m <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "months"), mean)
s8.d <- as.data.frame(s8.d)
s8.m <- as.data.frame(s8.m)
s8.d$time <- as.POSIXct(row.names(s8.d),format = "%Y-%m-%d")

Station 9

s9.xts <- xts(s9, order.by = s9$time)
s9.d <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "days"), mean)
s9.m <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "months"), mean)
s9.d <- as.data.frame(s9.d)
s9.m <- as.data.frame(s9.m)

Station 10

s10.xts <- xts(s10, order.by = s10$time)
s10.d <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "days"), mean)
s10.m <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "months"), mean)
s10.d <- as.data.frame(s10.d)
s10.m <- as.data.frame(s10.m)

Station 11

s11.xts <- xts(s11, order.by = s11$time)
s11.d <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "days"), mean)
s11.m <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "months"), mean)
s11.d <- as.data.frame(s11.d)
s11.m <- as.data.frame(s11.m)

deleting the datasets we dont gonna use

rm(s1,s1.xts,s10, s10.xts, s11, s11.xts,s2,s2.xts,s3, s3.xts, s5, s5.xts, 
   s7, s7.xts, s8, s8.xts,s9, s9.xts, s10, s10.xts, s11, s11.xts)

PM10 Analysis

Creating the dataframe for pm10 with day columns

# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
pm10 <- data.frame("station1"=s1.d$PM10,"station3"=s3.d$PM10,"station5"=s5.d$PM10,
                   "station10"=s10.d$PM10,"station11"=s11.d$PM10)
# Creating a time column to help us join previous dataframe with station 7 and 8.
pm10$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
# Full join pm10 and station7 by time column
pm10 <- full_join(pm10,s7.d[,c("time","PM10")],by="time")
# Full join pm10 and station8 by time column
pm10 <- full_join(pm10,s8.d[,c("time","PM10")],by="time")
# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
pm10 <- pm10 %>% rename(station7=PM10.x, station8=PM10.y) %>%
  select(time,station1,station3,station5,station7,station8,station10,station11) %>%
  mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8)) 
# Adding a mean column with mean value of PM10 for every day
pm10$mean <- rowMeans(pm10[,2:8])
knitr::kable(head(pm10,3)) # printing the first 3 columns
time station1 station3 station5 station7 station8 station10 station11 mean
2020-01-01 42.89521 18.62904 34.34300 20.03441 26.282481 3.927083 26.23751 24.62125
2020-01-02 20.59653 10.59705 23.11284 17.06729 10.038982 3.721084 30.37896 16.50182
2020-01-03 10.83952 11.74851 13.33836 16.47911 6.818125 3.705833 30.29477 13.31775

Creating the dataframe for pm10 with monthly columns

# adding the values to dataframe
pm10.m <- data.frame(s1.m$PM10,s3.m$PM10,s5.m$PM10,s7.m$PM10,
                     s8.m$PM10,s10.m$PM10,s11.m$PM10)
# Creating a column with mean values for every month
pm10.m$mean <- rowMeans(pm10.m)
# Adding row and column names
rownames(pm10.m) <- c("January","February","March","April","May","June") 
colnames(pm10.m) <- c(stations[c(1,3:6,8:9)],"mean")
knitr::kable(pm10.m, digits = 4) #printing the table
Nicosia_Traf Limassol_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Ayia_Marina_Xylianou_Back Paralimni_Traf mean
January 36.3907 25.0343 30.1276 25.8627 16.7807 7.6372 28.1558 24.2841
February 30.6833 27.2613 28.6184 26.1846 21.0343 11.2538 28.1471 24.7404
March 30.3524 27.4642 33.2767 26.4542 25.0162 20.2937 29.5287 27.4837
April 25.1905 23.7127 27.7513 21.3120 25.6438 19.0282 25.3662 24.0007
May 35.7941 35.3375 36.1732 35.4699 35.6575 25.1839 32.6934 33.7585
June 25.9754 26.6133 34.1944 23.4294 32.6121 18.3832 31.7686 27.5681

Summary statistics of PM10 for each station

colnames(pm10) <- c("time",stations[c(1,3:6,8:9)],"mean")
knitr::kable(do.call(cbind, lapply(pm10[2:9], summary)))
Nicosia_Traf Limassol_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Ayia_Marina_Xylianou_Back Paralimni_Traf mean
Min. 10.83952 10.00244 11.00115 9.065349 4.911398 3.240060 12.13500 12.47557
1st Qu. 21.24346 19.09453 22.15374 18.909459 17.613646 9.620417 21.06490 20.33870
Median 26.44153 25.34259 28.60686 24.125281 22.768948 14.252989 26.51567 24.25839
Mean 30.78019 27.58017 31.70585 26.173728 26.282481 17.039248 29.27912 26.97726
3rd Qu. 37.40693 32.06226 36.45506 29.643809 30.963913 19.605729 33.07865 29.77486
Max. 97.42616 94.58720 96.06582 89.734498 112.569130 82.874167 82.96542 90.03731

Data preperation

pm10.t <- as.data.frame(t(as.matrix(pm10))) # transpose the pm10 matrix
# changing row and column names
rownames(pm10.t) <- c("time",stations[c(1,3:6,8:9)],"mean")
colnames(pm10.t) <- pm10$time
pm10.bar2 <- pm10.t[2:8,]
pm10.bar2$row.names <- rownames(pm10.bar2)
# changing the data from wide format to a single column of data
pm10.bar2<-melt(pm10.bar2,id=c("row.names"))
# Changing the type of class
pm10.bar2$value <- as.numeric(pm10.bar2$value)
pm10.bar2$variable <- ymd(pm10.bar2$variable)
pm10.bar2$row.names <- as.factor(pm10.bar2$row.names)

Boxplot for each station

ggplot(data = pm10.bar2, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
  theme(legend.position = "none") + labs(title="PM10 boxplot for all stations", x="Stations",y="PM10") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

PM10 time series.

The green line is the air quality guideline from WHO and the pink line is the established EU limit value. The dashed orange line is the date Cyprus banned the flights and the red dashed line is the date the lockdown started. We can see there are some peaks as a result of dust episodes. We can see that COVID19 situation did not affect the values of PM10.

ggplot(pm10, aes(x=time,y=mean)) + geom_line() + geom_smooth() +
  geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
  labs(y="PM10 value", x="Month",title="January to June 2020 Time series for PM10") + 
  geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
  scale_color_manual(name = "Line notation", values = c(Lockdown="red",EU_limit_value="magenta",
                           WHO_air_quality_guideline="green",Flight_ban="orange")) +
  theme_classic() +
  annotate("text",x=pm10$time[140],y = 95,label="Dust Episode", size =4, fontface="bold", color="brown" )

Time Series plot for every station

ggplot(pm10.bar2, aes(x=variable,y=as.numeric(value))) + geom_area(aes(color = row.names, fill = row.names),
    alpha = 0.7) + theme_bw() +
    labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
    labs(fill='Station') + labs(color="Station") 

ggplot(pm10.bar2, aes(x=variable,y=as.numeric(value))) + geom_line(aes(color = row.names),
    alpha = 0.7, size=0.7) + theme_classic() +
    labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
    labs(color="Station") 

PM10 daily difference

COVID19 situation doesn’t change the stability of PM10 and the differences seems to be almost the same before and after lockdown. The highest differences can be spotted the first 2 weeks after lockdown.

pm10.ts <- ts(pm10[9], start=c(2020,1) , frequency = 365)
pm10.diff <- diff(pm10.ts)
pm10.dif <- data.frame("time"=pm10$time,"diff"=c(0,pm10.diff)) 
ggplot(pm10.dif, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("PM10 difference") +
  ggtitle("PM10 daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot of monthly values

pm10.mt <- as.data.frame(t(as.matrix(pm10.m))) ##transpose of month matrix
rownames(pm10.mt) <- c(stations[c(1,3:6,8:9)],"mean") #adding row names
heatmaply(pm10.mt, grid_color = "white", dendrogram = "none", main = "PM10 per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(pm10.mt,2))

Barplot of PM10 for every month and visual contribution of each station

pm10.bar <- pm10.mt[1:7,] # row filtering
pm10.bar$row.names <- rownames(pm10.bar) # adding column with row names
# changing the data from wide format to a single column of data
pm10.bar<-melt(pm10.bar,id=c("row.names"))
# creating the plot
ggplot(pm10.bar,aes(x=variable,y=value/7,fill=row.names))+geom_bar(position="stack", stat="identity") +
  labs(title = "PM10 per month and station contribution", y = "PM10 value", x = "Months") +
  labs(fill = "Stations")

Hierarchical Analysis

I choose to create three clusters. The clusters seems to be very reasonal. The green cluster includes four of five traffic stations. The red includes only Ayia_Marina_Xyliaroy station, the only Rural Station of our dataset and the blue includes Paphos traffic station and Zygi industrial station.

pm10.t2 <- as.data.frame(t(as.matrix(pm10[,2:8])))
row.names(pm10.t2)<- stations[c(1,3:6,8:9)]
pm10.t2$station <- stations[c(1,3:6,8:9)]
pm10.scaled <- as.data.frame(scale(pm10.t2[,1:182])) # scaling
dist_pm10 <- dist(pm10.scaled, method = 'euclidean') # calculating the distance
hclust_pm10 <- hclust(dist_pm10, method = 'complete') # creating the HC
# plotting th dendrogram
plot(hclust_pm10,labels=pm10.t2$station)
rect.hclust(hclust_pm10 ,  k = 3,border = 2:6) # adding the 3 cluster boxes

Elbow rule.

We can see that a good choice would be to choose k=2

fviz_nbclust(pm10.scaled, kmeans, method = "wss", k.max=6)

PCA

pm10_pca <- prcomp(pm10.t2[,-183], center = TRUE, scale. = TRUE)
summary(pm10_pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4    PC5     PC6       PC7
## Standard deviation     9.0772 5.8317 4.9680 4.17065 3.4946 3.36293 4.183e-15
## Proportion of Variance 0.4527 0.1869 0.1356 0.09557 0.0671 0.06214 0.000e+00
## Cumulative Proportion  0.4527 0.6396 0.7752 0.87076 0.9379 1.00000 1.000e+00

Cluster plot with PCA

This biplot for the first two principal components, accounting for 64% of the total variance

sub_grp <- cutree(hclust_pm10, k = 3)
fviz_cluster(list(data = pm10.scaled, cluster = sub_grp))

PCA 3d plot

This 3d plot for the first three principal components, accounting for 77% of the total variance

pca3d(pm10_pca, group=sub_grp, radius = 4, show.labels=T)
## [1] 1.4917757 0.7329541 0.5801611
## Creating new device
PCA 3d

PCA 3d

NOx Analysis

Creating the dataframe for NOx with day columns

nox <- data.frame("station1"=s1.d$NOx,"station3"=s3.d$NOx ,"station5"=s5.d$NOx , 
                   "station9"=s9.d$NOx,"station10"=s10.d$NOx,"station11"=s11.d$NOx)
nox$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
nox <- full_join(nox,s2.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s7.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s8.d[,c("time","NOx")],by="time")

nox <- nox %>% rename(station2=NOx.x, station7=NOx.y, station8=NOx) %>%
  select(time,station1,station2,station3,station5,station7,station8,
         station9,station10,station11) %>%
  mutate(station2=ifelse(is.na(station2),mean(station2, na.rm=T),station2),
         station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
nox$mean <- rowMeans(nox[,2:10])
knitr::kable(head(nox,3))
time station1 station2 station3 station5 station7 station8 station9 station10 station11 mean
2020-01-01 82.36372 45.79606 40.62136 58.09139 25.14944 12.34307 8.669477 1.704961 35.21159 34.43901
2020-01-02 56.73409 40.99054 55.56007 45.45797 26.65660 13.26133 10.280462 2.418991 39.59478 32.32831
2020-01-03 42.54235 49.56395 86.81541 65.22078 36.50002 12.76792 14.621379 2.824170 39.55709 38.93479

Creating the dataframe for pm10 with monthly columns

nox.m <- data.frame(s1.m$NOx,s2.m$NOx,s3.m$NOx,
                    s5.m$NOx,s7.m$NOx,s8.m$NOx,
                    s9.m$NOx,s10.m$NOx,s11.m$NOx)
nox.m$Mean <- rowMeans(nox.m)
row.names(nox.m)<-c("January","February","March","April","May","June")
colnames(nox.m)<-c(stations,"Mean")
knitr::kable(nox.m, digits = 2)
Nicosia_Traf Nicosia_Resi Limassol_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Mari_Indu Ayia_Marina_Xylianou_Back Paralimni_Traf Mean
January 91.52 52.55 65.59 59.05 30.98 13.16 12.87 2.92 42.54 41.24
February 63.56 42.07 57.72 44.19 24.97 15.39 14.25 3.67 30.46 32.92
March 33.44 21.28 34.36 29.16 15.23 13.05 10.73 3.22 16.98 19.72
April 15.85 8.54 13.42 11.89 6.16 7.63 7.94 2.38 9.91 9.30
May 25.51 12.79 23.90 20.43 16.51 9.49 10.92 2.28 14.14 15.11
June 23.31 14.33 27.21 20.37 10.07 15.36 18.16 1.82 15.09 16.19
colnames(nox) <- c("time",stations,"mean")
knitr::kable(do.call(cbind, lapply(nox[2:11], summary)))
Nicosia_Traf Nicosia_Resi Limassol_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Mari_Indu Ayia_Marina_Xylianou_Back Paralimni_Traf mean
Min. 4.381602 1.701600 4.374026 3.925617 1.096979 3.354127 4.109603 0.8867552 4.751423 4.282998
1st Qu. 18.641329 9.213398 17.151341 14.350127 6.056446 8.697741 8.450774 2.0224395 10.675313 12.574391
Median 28.274242 16.176603 30.812245 22.375224 15.017890 12.035261 10.922486 2.4228941 15.611197 17.676432
Mean 42.206705 25.325023 36.994232 30.840662 17.387691 12.343074 12.448582 2.7189122 21.518156 22.420337
3rd Qu. 54.810168 39.055830 52.007920 44.548574 25.405847 15.280685 15.560341 3.0809064 24.351356 31.687493
Max. 158.532276 100.709014 139.149394 126.046744 75.349972 27.850411 28.655146 8.9129858 113.762156 68.516004

data preperation

nox.t <- as.data.frame(t(as.matrix(nox)))
# changing row and column names
colnames(nox.t) <- nox$time
nox.bar <- nox.t[2:10,]
nox.bar$row.names <- rownames(nox.bar)
# changing the data from wide format to a single column of data
nox.bar<-melt(nox.bar,id=c("row.names"))
nox.bar$value <- as.numeric(nox.bar$value)
nox.bar$variable <- ymd(nox.bar$variable)
nox.bar$row.names <- as.factor(nox.bar$row.names)

Boxplot for each station

ggplot(data = nox.bar, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
  theme(legend.position = "none") + labs(title="PM10 boxplot for all stations", x="Stations",y="PM10") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

NOx time series

The green line is the air quality guideline from WHO and the pink line is the established EU limit value. The dashed orange line is the date Cyprus banned the flights and the red dashed line is the date the lockdown started. Some peaks are a result of dust episodes. We can see that COVID19 situation affect the values of NOx.

ggplot(nox, aes(x=time,y=mean)) + geom_line() + geom_smooth() +
  geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
  geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) + 
  labs(y="NOx value", x="Month",title="January to June 2020 Time series for NOx") + 
  scale_color_manual(name = "statistics", values = c(Lockdown="red",EU_limit_value="magenta",
                           Flight_ban="orange",WHO_air_quality_guideline="green")) +
  theme_classic() + 
  annotate("text",x=pm10$time[140],y = 37,label="Dust Episode", size =4, fontface="bold", color="brown" )

Time Series plot for every station

ggplot(nox.bar, aes(x=variable,y=as.numeric(value))) + geom_area(aes(fill = row.names, colour=row.names),
    alpha = 0.3) + theme_bw() +
    labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
    labs(fill="Station") + labs(colour="Station")

ggplot(nox.bar, aes(x=variable,y=as.numeric(value))) + geom_line(aes(colour=row.names), alpha=0.7, size=0.7) + theme_bw() +
    labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
    labs(colour="Station")

NOx daily difference

After COVID19 situation we can see that the daily difference is low and NOx seems to be more stable.

nox.ts <- ts(nox[11], start=c(2020,1) , frequency = 365)
nox.diff <- diff(nox.ts)
nox.diff <- data.frame("time"=nox$time,"diff"=c(0,nox.diff)) 
ggplot(nox.diff, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("NOx difference") +
  ggtitle("NOx daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot for NOx monthly values for each station

nox.mt <- as.data.frame(t(as.matrix(nox.m)))
heatmaply(nox.mt, grid_color = "white", dendrogram = "none", main = "NOx per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(nox.mt,2))

Barplot of NOx for every month and visual contribution of each station

nox.bar2 <- nox.mt[1:9,]
nox.bar2$row.names <- rownames(nox.bar2) # adding column with row names
# changing the data from wide format to a single column of data
nox.bar2<-melt(nox.bar2,id=c("row.names"))
# creating the plot
ggplot(nox.bar2,aes(x=variable,y=value/9,fill=row.names))+geom_bar(position="stack", stat="identity") +
  labs(title = "NOx per month and station contribution", y = "NOx value", x = "Months") +
  labs(fill = "Stations")

Hierarchical Analysis

Performing a Hierarchical Cluster analysis, calculating the distance between two points using euclidean distance and complete method to calculate the distance between clusters. Ayia Marina Xylianou has her own cluster, the red cluster includes the traffic stations of the three big cities of Cyprus and the blue cluster includes the traffic station of smallers cities like Paralimni and Paphos, the industrial stations and the Nicosia residential station. We can assume red cluster has the highest NOx values, green cluster has the lowest NOx values and blue cluster has NOx values between green and red.

nox.t2 <- as.data.frame(t(as.matrix(nox[,2:10])))
nox.t2$station <- stations
nox.scaled <- as.data.frame(scale(nox.t2[,1:182]))
dist_nox <- dist(nox.scaled, method = 'euclidean') # calculating the distance
hclust_nox <- hclust(dist_nox, method = 'complete') # creating the HC
# plotting th dendrogram
plot(hclust_nox,labels=nox.t2$station)
rect.hclust(hclust_nox ,  k = 3,border = 2:6) # adding the 3 cluster boxes

Elbow rule for NOx

fviz_nbclust(nox.scaled, kmeans, method = "wss", k.max=8)

PCA

nox_pca <- prcomp(nox.t2[,-183], center = TRUE, scale. = TRUE)
summary(nox_pca)
## Importance of components:
##                            PC1    PC2     PC3     PC4    PC5    PC6     PC7
## Standard deviation     11.1594 4.3701 3.69591 2.69370 2.4766 2.2210 1.96366
## Proportion of Variance  0.6842 0.1049 0.07505 0.03987 0.0337 0.0271 0.02119
## Cumulative Proportion   0.6842 0.7892 0.86423 0.90410 0.9378 0.9649 0.98609
##                            PC8       PC9
## Standard deviation     1.59114 2.641e-15
## Proportion of Variance 0.01391 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00

Cluster plot with PCA

This biplot for the first two principal components, accounting for 79% of the total variance

sub_grp2 <- cutree(hclust_nox, k = 3)
fviz_cluster(list(data = nox.scaled, cluster = sub_grp2))

PCA 3d plot

This 3d plot for the first three principal components, accounting for 86% of the total variance

pca3d(nox_pca, group=sub_grp2, radius = 4, show.labels=T)
## [1] 1.5326401 0.4950112 0.4708179
NOx PCA 3d

NOx PCA 3d

Loading the covid database

covid <- read.csv("owid-covid-data.csv")
covid <- covid %>% filter(location=="Cyprus") # filtering the Cyprus data
covid$date <- ymd(covid$date) # changing the date class

Joining the covid and pm10 dataset

pm10$time <- ymd(pm10$time) # changing the time class
# left join the two dataset using time and date column
pm10.cov <- merge(pm10, covid, by.x="time",by.y="date", all.x =T )
# removing columns with almost zero variance
pm10.cov <- pm10.cov %>% select(-nearZeroVar(pm10.cov)) 
# filtering the columns we need
pm10.cov2 <- pm10.cov %>% select(c(1,9:13))
# filling NA with 0 for the dates before COVID19
pm10.cov2[,3:6] <- sapply(pm10.cov2[,3:6], function(x) ifelse(is.na(x),0,x))

Correlation matrix

# creating a function to setup the geom_smooth graph of ggpair plot
my_fn <- function(data, mapping, method="loess", ...){
  p <- ggplot(data = data, mapping = mapping) + 
    geom_point() + 
    geom_smooth(method=method, color="black", ...)
  p
}

ggpairs(pm10.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn), 
        upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

creating the corrplot

We can see that there is no clear correlation between PM10 and COVID19. The correlation values are close to zero so there is almost zero impact of coronavirus to values of PM10.

cor.pm10 <- cor(pm10.cov2[,2:6])
corrplot.mixed(cor.pm10)

Join the nox and covid database

nox$time <- ymd(nox$time)
# left join between nox and covid dataset
nox.cov <- merge(nox, covid, by.x="time",by.y="date", all.x =T )
nox.cov <- nox.cov %>% select(-nearZeroVar(nox.cov)) 
nox.cov2 <- nox.cov %>% select(c(1,11:15))
nox.cov2[,3:6] <- sapply(nox.cov2[,3:6], function(x) ifelse(is.na(x),0,x))

Correlation matrix between mean value of NOx and COVID19 values

ggpairs(nox.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn), 
        upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

We can see that we have a negative correlation between NOx value and total cases of coronavirus. That negative correlation means that when total cases increase the NOx mean value descreases. So there is COVID19 impact in the NOx values.

cor.nox <- cor(nox.cov2[,2:6])
corrplot.mixed(cor.nox)